#include "statsxx/machine_learning/Ensemble.hpp"

// STL
#include <algorithm>           // std::max()
#include <cmath>               // std::log(), ::exp()
#include <cstddef>             // std::size_t
#include <fstream>             // std::ofstream
#include <limits>              // std::numeric_limits<>::lowest()
#include <string>              // std::string
#include <vector>              // std::vector<>

// jScience
#include "jScience/linalg.hpp" // Matrix<>, Vector<>, normalize()

// stats++
#include "statsxx/machine_learning/loss_functions.hpp" // Brier_score()


//
// DESC: Use Bayesian Model Averaging (BMA) to determine the weightings of each Learner.
//
// -----
//
// NOTE: BMA seeks to approximate the Bayes Optimal Classifier.
//
// NOTE: This algorithm often (but not always!) performs less well than BMC.
//
// NOTE: ... The issue is that it often gives nearly all of the weight toward only a few of the models.
//
// -----
//
// NOTE: The ``predictive accuracy'' of each learner has to be calculated.
//
// NOTE: ... The Brier score (a proper scoring rule for the accuracy of probabilistic predictions) is used for this.
//
// -----
//
// NOTE: See:
//
//     Hoeting, J. A.; Madigan, D.; Raftery, A. E.; Volinsky, C. T. (1999). ``Bayesian Model Averaging: A Tutorial''. Statistical Science 14 (4): 382–401
//
// NOTE: ... And also an earlier version of the Wikipedia page:
//
//     https://en.wikipedia.org/wiki/Ensemble_learning
//
// NOTE: See also:
//
//     T. P. Minka, ``Bayesian model averaging is not model combination'' (2002).
//
// NOTE: See also the discussion about BMA in:
//
//     Monteith, K. et al., ``Turning Bayesian Model Averaging into Bayesian Model Combination,'' Proceedings of the International Joint Conference on Neural Networks IJCNN'11. pp. 2657–2663.
//
// -----
//
// NOTE: The implementation below is based on the aforementioned Wikipedia reference (which is no longer shown online).
//
// NOTE: ... Based on a comparison with ::optimize_BMC(), this algorithm appears to be correct.
//
template<typename T>
inline void Ensemble<T>::optimize_BMA(
                                      const Matrix<double> &X_in,
                                      const Matrix<double> &X_out,
                                      // -----
                                      const std::string     prefix
                                      )
{
    //=========================================================
    // INITIALIZATION
    //=========================================================

    // SETUP OUTPUT FILE
    //
    // NOTE: Diagnostic output (initial/final predictive accuracy) goes to "<prefix>.BMA.dat".
    std::ofstream ofs( (prefix + ".BMA.dat") );

    double z = std::numeric_limits<double>::lowest(); // used to maintain numerical stability

    // SET PRIORS
    //
    // NOTE: Below, uniform priors are used.
    std::vector<double> prior(this->learners.size(), 1.);

    // CALCULATE OUTPUT FROM INDIVIDUAL LEARNERS
    //
    // NOTE: This is used in several instances below.
    //
    // TODO: NOTE: Maybe the following functionality is to go elsewhere.

    std::vector<Matrix<double>> X_out_i(this->learners.size(), Matrix<double>(X_in.size(0),X_out.size(1)));

    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        for( std::size_t j = 0; j < X_in.size(0); ++j )
        {
            std::vector<double> output = this->learners[i].f_evaluate(X_in.row(j).std_vector());

            for( std::size_t k = 0; k < X_out.size(1); ++k )
            {
                X_out_i[i](j,k) = output[k];
            }
        }
    }

    // CALCULATE INITIAL PREDICTIVE ACCURACY
    Matrix<double> output = this->combine_output(
                                                 X_out_i
                                                 );

    double x = 1. - loss_function::Brier_score(
                                               output,
                                               X_out
                                               );

    ofs << "Initial predictive accuracy (1. - Brier_score): " << x << '\n';

    //=========================================================
    // SAMPLE (HYPOTHESIS SPACE), BY EACH LEARNER
    //=========================================================

    //---------------------------------------------------------
    // CALCULATE THE LIKELIHOOD OF EACH MODEL (Learner), GIVEN THE DATA
    //---------------------------------------------------------

    // x*log(x), extended continuously to x == 0 (lim_{x->0+} x*log(x) = 0).
    //
    // NOTE: Without this guard, eps == 0 (a perfect learner) or eps == 1 (a
    // NOTE: ... completely wrong learner) would yield 0*log(0) = 0*(-inf) = NaN,
    // NOTE: ... which would then propagate through z, exp(), and normalize()
    // NOTE: ... and corrupt every weight.
    const auto x_log_x = []( const double v ) { return (v > 0.) ? v*std::log(v) : 0.; };

    std::vector<double> log_likelihood(this->learners.size());

    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        // CALCULATE THE PROBABILITY OF CLASS CORRUPTION
        //
        // NOTE: This can be estimated by the 1 - predictive accuracy, or average error rate (i.e., per example) of the model on the data.
        //
        // NOTE: See the NOTE in the introduction above about this.

        // error rate for predicting the data
        double eps = loss_function::Brier_score(
                                                X_out_i[i],
                                                X_out
                                                );

        // ... AND USE THIS TO *ESTIMATE* THE LOG-LIKELIHOOD
        log_likelihood[i] = X_in.size(0)*( x_log_x(eps) + x_log_x(1. - eps) );

        // NOTE: This formula can be understood by expanding:
        //
        //     exp(log_likelihood) = exp[n*( eps*log(eps) + (1. - eps)*log(1. - eps) )]
        //                         = exp[n*eps*log(eps)]*exp[n*(1. - eps)*log(1. - eps)]
        //                         = eps^(n*eps)*(1. - eps)^(n - n*eps)
        //
        //     where n = X_in.size(0) is the number of training examples.
        //
        // ... Taking n*eps and (n - n*eps) to be the total number of examples incorrectly and correctly classified, respectively, ...
        //
        // ... the likelihood formula in Eq. (3) in Monteith et al. is obtained (for BMA).
        //
        // ... Note that the probability of each example p(h) [in Eq. (3)] is assumed equal over the training data.

        // NOTE: Using the log_likelihood directly often leads to 0 likelihood.
        //
        // NOTE: ... z is used to maintain numerical stability, with respect to this.
        z = std::max(z, log_likelihood[i]);
    }

    //---------------------------------------------------------
    // CALCULATE WEIGHTS
    //---------------------------------------------------------

    // NOTE: w[i] ∝ prior[i]*exp(log_likelihood[i]); subtracting z (the largest
    // NOTE: ... log-likelihood) before exponentiating keeps the largest term at
    // NOTE: ... exp(0) = 1 and avoids underflow to 0 for all weights.
    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        this->w[i] = prior[i]*std::exp(log_likelihood[i] - z);
    }

    // NORMALIZE WEIGHTS
    this->w = normalize(Vector<double>(this->w)).std_vector();

    //=========================================================
    // POST PROCESS
    //=========================================================

    // CALCULATE FINAL PREDICTIVE ACCURACY
    output = this->combine_output(
                                  X_out_i
                                  );

    x = 1. - loss_function::Brier_score(
                                        output,
                                        X_out
                                        );

    // NOTE: The spacing in the following is purposeful (to align with the initial output above).
    ofs << "Final predictive accuracy (1. - Brier_score)  : " << x << '\n';

    //=========================================================
    // CLEANUP AND RETURN
    //=========================================================

    ofs.close();
}
